1 Initial setup

1.1 Setup of the environment:

# Install the SDAM custom package from GitHub; if the primary repository fails,
# use the mirror in the commented fallback line below.
devtools::install_github("sdam-au/sdam") # loading SDAM custom package, if not working try devtools::install_github("mplex/cedhar", subdir="pkg/sdam")
#devtools::install_github("mplex/cedhar", subdir="pkg/sdam")
library(tidyverse)  # dplyr/tidyr/ggplot2/stringr - data wrangling and plotting
library(sdam)       # access to SDAM datasets on sciencedata.dk (request())
library(jsonlite)   # JSON parsing (fromJSON())
library(leaflet)    # interactive maps - not used in the visible part of this script
library(tidytext)   # unnest_tokens(), bind_tf_idf()

1.2 Loading data

  1. Load the dataset, if you have Sciencedata.dk credentials
# Fetch the cleaned EDH (Epigraphic Database Heidelberg) JSON from sciencedata.dk.
# NOTE(review): `mycred_secret` (sciencedata.dk credentials) is defined outside
# this view - confirm it is loaded before running. `request()` is from sdam.
resp = request("EDH_text_cleaned_2020-10-09.json", path="/sharingin/648597@au.dk/SDAM_root/SDAM_data/EDH/public", method="GET", cred=mycred_secret)
  1. Make a list and tibble from the request function
# Parse the JSON payload into a list, then convert to a tibble for tidy tools.
list_json <- jsonlite::fromJSON(resp)
EDH_tibble <- as_tibble(list_json)
  1. Display the first 6 records
# Sanity check: peek at the first six records.
head(EDH_tibble)

2 Tidy text analysis of the clean_text_interpretive_word column

2.1 Tokenizing words, splitting on a single space character

# Tokenize: one output row per word of the cleaned interpretive text, keeping
# all other columns. Splitting on a single space via stringr::str_split (rather
# than the default word tokenizer) preserves the pre-cleaned word forms;
# drop_na() removes rows from inscriptions that have no text.
EDH_tokenized <- EDH_tibble %>% 
  unnest_tokens(word, clean_text_interpretive_word, token = stringr::str_split, pattern = " ") %>% 
  drop_na(word) %>%
  print()
## # A tibble: 1,136,004 x 74
##    responsible_ind… type_of_inscrip… letter_size not_after literature
##    <chr>            <chr>            <chr>       <chr>     <chr>     
##  1 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  2 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  3 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  4 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  5 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  6 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  7 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  8 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
##  9 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
## 10 Feraudi          epitaph          3.2-2 cm    0130      AE 1983, …
## # … with 1,135,994 more rows, and 69 more variables: work_status <chr>,
## #   height <chr>, diplomatic_text <chr>, people <list>, depth <chr>,
## #   material <chr>, type_of_monument <chr>, province_label <chr>, width <chr>,
## #   transcription <chr>, country <chr>, uri <chr>, findspot_ancient <chr>,
## #   last_update <chr>, modern_region <chr>, findspot_modern <chr>,
## #   language <chr>, id <chr>, edh_geography_uri <chr>, commentary <chr>,
## #   trismegistos_uri <chr>, not_before <chr>, external_image_uris <list>,
## #   fotos <list>, coordinates <list>, idno_tm <chr>, placenames_refs <list>,
## #   text_edition <chr>, origdate_text <chr>, layout_execution <chr>,
## #   layout_execution_text <chr>, support_objecttype <chr>,
## #   support_objecttype_text <chr>, support_material <chr>,
## #   support_material_text <chr>, support_decoration <chr>, keywords_term <chr>,
## #   keywords_term_text <chr>, type_of_inscription_clean <chr>,
## #   type_of_inscription_certainty <chr>, height_cm <dbl>, width_cm <dbl>,
## #   depth_cm <dbl>, material_clean <chr>, type_of_monument_clean <chr>,
## #   type_of_monument_certainty <chr>, province_label_clean <chr>,
## #   province_label_certainty <chr>, country_clean <chr>,
## #   country_certainty <chr>, findspot_ancient_clean <chr>,
## #   findspot_ancient_certainty <chr>, modern_region_clean <chr>,
## #   modern_region_certainty <chr>, findspot_modern_clean <chr>,
## #   findspot_modern_certainty <chr>, findspot_clean <chr>,
## #   findspot_certainty <chr>, origdate_text_clean <chr>,
## #   clean_text_conservative <chr>, clean_text_interpretive_sentence <chr>,
## #   findspot <chr>, year_of_find <chr>, present_location <chr>, religion <chr>,
## #   geography <chr>, social_economic_legal_history <chr>, military <chr>,
## #   word <chr>

2.2 Counting the most common words

# Most common words overall: keep words attested more than 5,000 times and
# relevel `word` by its count so downstream plots order bars by frequency.
EDH_tokenized %>%
  count(word, sort = TRUE) %>%
  filter(n > 5000) %>%
  mutate(word = reorder(word, n)) %>%
  print()
## # A tibble: 22 x 2
##    word        n
##    <fct>   <int>
##  1 et      36364
##  2 dis     12165
##  3 manibus 12024
##  4 vixit    9544
##  5 in       8781
##  6 annos    7947
##  7 filius   7536
##  8 annorum  7051
##  9 est      6880
## 10 i        6827
## # … with 12 more rows

2.3 Number of total words on inscriptions per Roman province

# Total number of tokens per province, with the province factor releveled by
# total for a sorted bar chart. Changes from the original:
#  - `->` end-assignment replaced by conventional `<-` at the start;
#  - `.groups = "drop"` makes the ungrouping explicit and silences the
#    summarise() message;
#  - theme_classic() removed: theme_linedraw() is a complete theme applied
#    afterwards, so theme_classic() had no effect at all.
words_total_province <- EDH_tokenized %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  summarise(total = sum(n), .groups = "drop") %>%
  mutate(province_label_clean = reorder(province_label_clean, total))

head(words_total_province)

words_total_province %>%
  ggplot(aes(total, province_label_clean)) +
  geom_col(fill = "darkblue", width = 0.7) +
  labs(x = "Number of words", y = "Province name",
       title = "Number of total words on inscriptions per Roman province",
       subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)

2.4 The most common words by Roman provinces

# Stacked bars of the words exceeding 1,000 occurrences within each province.
EDH_tokenized %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 1000) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(x = n, y = province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(
    x = "Number of words",
    y = "Province name",
    title = "The most common words on inscriptions per Roman province",
    subtitle = "EDH dataset, n = 81,476 inscriptions"
  ) +
  theme_linedraw(base_size = 10)

2.5 The most common words by type of an inscription (epitaph)

# The most common words on epitaphs (more than 1,000 attestations).
# Changes from the original:
#  - filter on the inscription type directly; the group_by() was pointless for
#    a single retained group and left the result grouped;
#  - map n to the bar *fill*: the original mapped `color`, which only tints
#    bar outlines, so the blue-red gradient never appeared on the bars;
#  - theme_minimal() removed - it was entirely overridden by the later
#    complete theme theme_linedraw();
#  - the subtitle is derived from the computed total instead of a hard-coded
#    figure, so it stays correct if the data changes.
words_epitaph <- EDH_tokenized %>%
  count(type_of_inscription_clean, word, sort = TRUE) %>%
  filter(type_of_inscription_clean == "epitaph") %>%
  filter(n > 1000) %>%
  mutate(word = reorder(word, n))

total_words_epitaph <- sum(words_epitaph$n)

words_epitaph %>%
  ggplot(aes(y = word, x = n, fill = n)) +
  geom_col(width = 0.7) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on epitaphs",
       subtitle = paste0("n = ", format(total_words_epitaph, big.mark = ","), " words"))

2.6 The most common words by type of an inscription (milestone)

# The most common words on milestones (more than 100 attestations).
# Same fixes as the epitaph section: direct filter instead of a pointless
# group_by(); gradient mapped to `fill` (the original's `color` aesthetic only
# tinted bar outlines); redundant theme_minimal() dropped (overridden by
# theme_linedraw()); subtitle computed from the actual total.
words_milestone <- EDH_tokenized %>%
  count(type_of_inscription_clean, word, sort = TRUE) %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  filter(n > 100) %>%
  mutate(word = reorder(word, n))
words_milestone

total_words_milestone <- sum(words_milestone$n)

words_milestone %>%
  ggplot(aes(y = word, x = n, fill = n)) +
  geom_col(width = 0.6) +
  scale_fill_gradient(low = "blue", high = "red") +
  theme_linedraw(base_size = 9) +
  labs(x = "Number of words", y = "Word",
       title = "The most common words on milestones",
       subtitle = paste0("n = ", format(total_words_milestone, big.mark = ","), " words"))

ggsave(filename = "../figures/EDH_milestone_common_words.png", width = 8, height = 8)

2.7 The most common words on milestones per province

# Milestone vocabulary by province: words with more than 50 occurrences in a
# province, shown as stacked bars per province.
EDH_tokenized %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 50) %>%
  mutate(province_label_clean = reorder(province_label_clean, n)) %>%
  ggplot(aes(x = n, y = province_label_clean)) +
  geom_col(aes(fill = word), width = 0.7) +
  labs(
    x = "Number of words",
    y = "Province name",
    title = "The most common words on milestones per Roman province",
    subtitle = "EDH dataset, n = 81,476 inscriptions"
  ) +
  theme_linedraw(base_size = 10)

library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of frequent milestone words (per-province counts above 50).
# The original also releveled `province_label_clean` before plotting, but
# wordcloud() reads only `word` and `n`, so that mutate() was dead code and
# has been removed.
EDH_tokenized %>%
  filter(type_of_inscription_clean == "mile-/leaguestone") %>%
  count(province_label_clean, word, sort = TRUE) %>%
  group_by(province_label_clean) %>%
  filter(n > 50) %>%
  with(wordcloud(word, n, max.words = 200))

# Frequency of words. Source: https://www.tidytextmining.com/tfidf.html.
# Combining term frequency with inverse document frequency lets us find words
# that are characteristic of one document within a collection of documents.

# Word counts per inscription type, joined with each type's total token count
# so term frequency (n/total) can be computed.
insc_types_words <- EDH_tokenized %>% 
  count(type_of_inscription_clean, word, sort = TRUE)

total_words <- insc_types_words %>% 
  group_by(type_of_inscription_clean) %>% 
  summarize(total = sum(n), .groups = "drop")

# Join key made explicit; the original relied on the default key and printed
# `Joining, by = "type_of_inscription_clean"` at run time.
insc_types_words <- left_join(insc_types_words, total_words,
                              by = "type_of_inscription_clean")
insc_types_words

# Distribution of term frequency per type. xlim() trims the extreme right
# tail, which is why ggplot warns about removed non-finite / missing rows.
ggplot(insc_types_words, aes(n/total, fill = type_of_inscription_clean)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4509 rows containing non-finite values (stat_bin).
## Warning: Removed 18 rows containing missing values (geom_bar).

ggsave(filename = "../figures/EDH_freq_words_insc_type.png", width = 8, height = 8)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4509 rows containing non-finite values (stat_bin).

## Warning: Removed 18 rows containing missing values (geom_bar).

2.8 Rank of words (Zipf’s law)

# Zipf's law: rank words within each inscription type. Rows are already sorted
# by descending n from count(sort = TRUE), so row_number() is the frequency rank.
freq_by_rank <- insc_types_words %>% 
  group_by(type_of_inscription_clean) %>% 
  mutate(rank = row_number(), 
         `term frequency` = n/total)

freq_by_rank

# Rank vs term frequency on log-log axes; Zipf's law predicts a straight line.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  scale_x_log10() +
  scale_y_log10()

# Fit the power law on the middle portion of the rank range, where the
# relationship is most linear.
rank_subset <- freq_by_rank %>% 
  filter(rank < 500,
         rank > 10)

zipf_fit <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
zipf_fit
## Coefficients: (Intercept) -0.9723, log10(rank) -0.9194

# Overlay the line actually fitted above. The original hard-coded
# intercept = -0.62 and slope = -1.1 (values from the tidytextmining book's
# corpus), which contradicted the coefficients printed just above.
freq_by_rank %>% 
  ggplot(aes(rank, `term frequency`, color = type_of_inscription_clean)) + 
  geom_line(size = 1.1, alpha = 0.8, show.legend = TRUE) + 
  geom_abline(intercept = coef(zipf_fit)[1], slope = coef(zipf_fit)[2],
              color = "gray50", linetype = 2) +
  scale_x_log10() +
  scale_y_log10()

Commentary: the EDH corpus uses a lower percentage of the most common words than many other collections of language.

2.9 Term frequency vs inverse document frequency

# Attach tf, idf and tf-idf columns to the per-type word counts.
insc_types_words <- insc_types_words %>%
  bind_tf_idf(word, type_of_inscription_clean, n)

insc_types_words

# Highest tf-idf words overall (total column dropped for readability).
insc_types_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))

# Top 15 tf-idf words per inscription type. Changes from the original:
#  - explicit `wt = tf_idf` in top_n() - identical selection, but without the
#    run-time "Selecting by tf_idf" message;
#  - the first labs(x = NULL, y = "tf-idf") call was removed: the second
#    labs() call set both x and y again, so the first had no effect.
insc_types_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(type_of_inscription_clean) %>% 
  top_n(15, tf_idf) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill = type_of_inscription_clean)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~type_of_inscription_clean, ncol = 4, scales = "free_y") +
  coord_flip() +
  labs(x = "word", y = "tf-idf", title = "Term frequency - inverse document frequency (tf-idf) by type of inscription", subtitle = "EDH dataset, n = 81,476 inscriptions") +
  theme_linedraw(base_size = 10)

ggsave("../figures/EDH_tf_idf_insc_type.png", width = 16, height = 16)

2.10 Custom stopwords list

mystopwords <- tibble(word = c("et", "in", "qui", "i", "v", "ii", "ex"))

3 N-grams and correlations

3.1 Bigrams

# Tokenize the interpretive text into bigrams (pairs of adjacent words),
# keeping type and province for later breakdowns.
insc_bigrams <- EDH_tibble %>%
  select(clean_text_interpretive_word, type_of_inscription_clean, province_label_clean) %>%
  unnest_tokens(bigram, clean_text_interpretive_word, token = "ngrams", n = 2)
head(insc_bigrams)

# Most frequent bigrams overall.
insc_bigrams %>%
  count(bigram, sort = TRUE)

# Split each bigram into its two component words for word-level filtering.
bigrams_separated <- insc_bigrams %>%
  separate(bigram, into = c("word1", "word2"), sep = " ")

bigrams_separated %>%
  count(word1, word2, sort = TRUE)

3.1.1 Analysis of bi-grams

Which other words occur together with the word “passuum”?

# Words immediately preceding "passuum" (i.e. word2 == "passuum"), by type.
bigrams_separated %>%
  filter(word2 == "passuum") %>%
  count(type_of_inscription_clean, word1, sort = TRUE)
# Words immediately following "passuum" (i.e. word1 == "passuum"), by type.
bigrams_separated %>%
  filter(word1 == "passuum") %>%
  count(type_of_inscription_clean, word2, sort = TRUE)

3.1.2 Frequencies in bigram

# tf-idf of bigrams by inscription type.
bigram_tf_idf <- insc_bigrams %>%
  count(type_of_inscription_clean, bigram) %>%
  bind_tf_idf(bigram, type_of_inscription_clean, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf

# Top 10 bigrams by tf-idf per type. `wt = tf_idf` is now explicit in top_n()
# (identical selection to the original's implicit last-column default, but
# without the run-time "Selecting by tf_idf" message).
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  mutate(bigram = factor(bigram, levels = rev(unique(bigram)))) %>%
  group_by(type_of_inscription_clean) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(bigram, tf_idf, fill = type_of_inscription_clean)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~type_of_inscription_clean, ncol = 3, scales = "free_y") +
  coord_flip() +
  theme_linedraw(base_size = 10)

ggsave("../figures/EDH_bigrams_tf_idf_insc_type.png", width = 20, height = 20)

3.1.3 Visualising bigrams as network

library(igraph)
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
# Build a directed graph of frequent bigrams (more than 500 occurrences):
# edges run word1 -> word2, with the count n attached as an edge attribute.
bigram_graph <- bigrams_separated %>%
  count(word1, word2, sort = TRUE) %>%
  filter(n > 500) %>%
  graph_from_data_frame()
bigram_graph
## IGRAPH 9fea7f5 DN-- 105 87 -- 
## + attr: name (v/c), n (e/n)
## + edges from 9fea7f5 (vertex names):
##  [1] dis       ->manibus   vixit     ->annos     votum     ->solvit   
##  [4] solvit    ->libens    tribunicia->potestate manibus   ->sacrum   
##  [7] libens    ->merito    hic       ->situs     situs     ->est      
## [10] bene      ->merenti   iovi      ->optimo    optimo    ->maximo   
## [13] sibi      ->et        vixit     ->annis     imperatori->caesari  
## [16] imperator ->caesar    hic       ->sita      pro       ->salute   
## [19] terra     ->levis     sita      ->est       tibi      ->terra    
## [22] et        ->i         sit       ->tibi      pro       ->praetore 
## + ... omitted several edges
library(ggraph)
set.seed(1000)  # fixed seed so the force-directed layout is reproducible

# Draw the bigram network with the Fruchterman-Reingold ("fr") layout:
# edges first, then nodes, then node labels on top.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1)

ggsave("../figures/EDH_bigrams_networks.png", width = 10, height = 10)

3.1.3.1 Another network graph

set.seed(1000)  # same seed as above: identical node placement across plots

# Closed arrowheads so edge direction (word order) is visible.
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))

# Refined network: edge transparency encodes bigram frequency, and end_cap
# keeps arrowheads from overlapping the node circles.
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(
    aes(edge_alpha = n),
    show.legend = FALSE,
    arrow = a,
    end_cap = circle(.07, 'inches')
  ) +
  geom_node_point(color = "lightblue", size = 4) +
  geom_node_text(aes(label = name), hjust = 1, vjust = 1) +
  theme_void()

ggsave("../figures/EDH_bigrams_networks_2.png", width = 10, height = 10)

3.2 Tri-grams

# Trigrams: tokenize into three-word sequences, split into component words,
# and count the distinct triples.
insc_trigram <- EDH_tibble %>%
  select(clean_text_interpretive_word, type_of_inscription_clean, province_label_clean) %>%
  unnest_tokens(trigram, clean_text_interpretive_word, token = "ngrams", n = 3) %>%
  separate(trigram, into = c("word1", "word2", "word3"), sep = " ") %>%
  count(word1, word2, word3, sort = TRUE)
insc_trigram

3.3 Counting and correlating pairs of words with the widyr package

library(widyr)

# Count words co-occurring within the same inscription (keyed by `id`) using
# widyr::pairwise_count(); sort = TRUE puts the most frequent pairs first.
word_pairs<- EDH_tokenized %>% 
  pairwise_count(word, id, sort = TRUE)
## Warning: `distinct_()` is deprecated as of dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
# Inspect the pair counts, then the words most often co-occurring with "votum".
word_pairs
word_pairs %>%
  filter(item1 == "votum")

3.3.1 Pairwise correlation

Correlation among words, which indicates how often they appear together relative to how often they appear separately.

# Pairwise correlation between words across inscriptions (widyr::pairwise_cor,
# which computes the phi coefficient per its documentation), restricted to
# words attested at least 100 times so rare tokens do not dominate.
word_cors <- EDH_tokenized %>%
  group_by(word) %>%
  filter(n() >= 100) %>%
  pairwise_cor(word, id, sort = TRUE)

word_cors
# Words most strongly correlated with "votum".
word_cors %>%
  filter(item1 == "votum")

3.4 Exploration of matrix

# how many words has milestone
# how many words has milestone
# NOTE(review): `EDH_dfm` is never created in the visible part of this script -
# presumably a document-feature matrix (rows = inscription types, columns =
# words) built elsewhere; confirm it exists before running this section.
sum(EDH_dfm["mile-/leaguestone",])
## [1] 39424
# Largest single word count in the milestone row.
max(EDH_dfm["mile-/leaguestone",])
## [1] 1102
# Keep the milestone row for further exploration.
milestone <- EDH_dfm["mile-/leaguestone",]